/*

 * Licensed to the Apache Software Foundation (ASF) under one

 * or more contributor license agreements.  See the NOTICE file

 * distributed with this work for additional information

 * regarding copyright ownership.  The ASF licenses this file

 * to you under the Apache License, Version 2.0 (the

 * "License"); you may not use this file except in compliance

 * with the License.  You may obtain a copy of the License at

 *

 *     http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */

package com.bff.gaia.unified.sdk.schemas.transforms;



import com.google.auto.value.AutoValue;

import com.bff.gaia.unified.sdk.annotations.Experimental;

import com.bff.gaia.unified.sdk.annotations.Experimental.Kind;

import com.bff.gaia.unified.sdk.schemas.Schema;

import com.bff.gaia.unified.sdk.transforms.DoFn;

import com.bff.gaia.unified.sdk.transforms.PTransform;

import com.bff.gaia.unified.sdk.transforms.ParDo;

import com.bff.gaia.unified.sdk.transforms.SerializableFunction;

import com.bff.gaia.unified.sdk.values.PCollection;

import com.bff.gaia.unified.sdk.values.Row;

import com.bff.gaia.unified.vendor.guava.com.google.common.collect.Lists;



import java.util.List;



/**

 * A {@link PTransform} to unnest nested rows.

 *

 * <p>For example, consider a Row with the following nested schema:

 *

 * <p>UserEvent Schema: userid: INT64 timestamp: DATETIME location: LatLong

 *

 * <p>LatLong Schema: latitude: DOUBLE longitude: DOUBLE

 *

 * <p>After unnesting, all of the rows will be converted to rows satisfying the following schema:

 *

 * <p>UserEvent Schema: userid: INT64 timestamp: DATETIME location_latitude: DOUBLE

 * location_longitude: DOUBLE

 *

 * <p>By default nested names are concatenated to generate the unnested name, however {@link

 * Unnest.Inner#withFieldNameFunction} can be used to specify a custom naming policy.

 *

 * <p>Note that currently array and map values are not unnested.

 */

@Experimental(Kind.SCHEMAS)

public class Unnest {
  /**
   * This is the default naming policy for naming fields. Every field name in the path to a given
   * field is concatenated with {@code _} characters, e.g. the path {@code [location, latitude]}
   * yields {@code location_latitude}.
   */
  public static final SerializableFunction<List<String>, String> CONCAT_FIELD_NAMES =
      path -> String.join("_", path);

  /**
   * This policy keeps the raw nested field name, i.e. only the last component of the path. If two
   * differently-nested fields have the same name, unnesting will fail with this policy.
   */
  public static final SerializableFunction<List<String>, String> KEEP_NESTED_NAME =
      path -> path.get(path.size() - 1);

  /**
   * Creates an unnesting transform that uses the default {@link #CONCAT_FIELD_NAMES} naming
   * policy.
   *
   * @param <T> the element type of the input collection
   * @return a new {@link Inner} transform
   */
  public static <T> Inner<T> create() {
    return new AutoValue_Unnest_Inner.Builder<T>().setFieldNameFunction(CONCAT_FIELD_NAMES).build();
  }

  /**
   * Returns the result of unnesting the given schema. The default naming policy ({@link
   * #CONCAT_FIELD_NAMES}) is used.
   *
   * @param schema the possibly-nested schema to flatten
   * @return a schema in which nested row fields are replaced by their leaf fields
   */
  static Schema getUnnestedSchema(Schema schema) {
    return getUnnestedSchema(schema, CONCAT_FIELD_NAMES);
  }

  /**
   * Returns the result of unnesting the given schema with the given naming policy.
   *
   * @param schema the possibly-nested schema to flatten
   * @param fn maps the list of field names on the path to a leaf field to the flattened name
   * @return a schema in which nested row fields are replaced by their leaf fields
   */
  static Schema getUnnestedSchema(Schema schema, SerializableFunction<List<String>, String> fn) {
    List<String> nameComponents = Lists.newArrayList();
    return getUnnestedSchema(schema, nameComponents, fn);
  }

  /**
   * Recursive worker for schema unnesting.
   *
   * <p>{@code nameComponents} is a working stack holding the field names from the root down to the
   * field currently being visited; each field name is pushed before the field is processed and
   * popped afterwards, so the list is left as it was found when this method returns.
   */
  private static Schema getUnnestedSchema(
      Schema schema, List<String> nameComponents, SerializableFunction<List<String>, String> fn) {
    Schema.Builder flattened = Schema.builder();
    for (Schema.Field field : schema.getFields()) {
      nameComponents.add(field.getName());
      if (field.getType().getTypeName().isCompositeType()) {
        // Flatten the nested row first, then splice its leaf fields in at this position.
        Schema nested = getUnnestedSchema(field.getType().getRowSchema(), nameComponents, fn);
        for (Schema.Field leaf : nested.getFields()) {
          flattened.addField(leaf);
        }
      } else {
        // Leaf field: keep everything about it except the name, which the policy decides.
        String flatName = fn.apply(nameComponents);
        flattened.addField(field.toBuilder().setName(flatName).build());
      }
      nameComponents.remove(nameComponents.size() - 1);
    }
    return flattened.build();
  }

  /**
   * Unnest a row.
   *
   * @param input the possibly-nested row to flatten
   * @param unnestedSchema the schema of the resulting row; the leaf values of {@code input} are
   *     added to it in depth-first field order
   * @return a row with {@code unnestedSchema} holding the leaf values of {@code input}
   * @throws NullPointerException if a nested row value inside {@code input} is null
   */
  static Row unnestRow(Row input, Schema unnestedSchema) {
    Row.Builder builder = Row.withSchema(unnestedSchema);
    unnestRow(input, builder);
    return builder.build();
  }

  /**
   * Appends the leaf values of {@code input} to {@code output}, descending into nested rows in
   * field order (the same traversal order used when unnesting the schema).
   */
  private static void unnestRow(Row input, Row.Builder output) {
    Schema schema = input.getSchema();
    int fieldCount = schema.getFieldCount();
    for (int i = 0; i < fieldCount; ++i) {
      Schema.Field field = schema.getField(i);
      if (field.getType().getTypeName().isCompositeType()) {
        Row nested = input.getRow(i);
        if (nested == null) {
          // Previously this surfaced as a message-less NPE one frame deeper; fail with the same
          // exception type but say which field was responsible.
          throw new NullPointerException(
              "Cannot unnest a null value for nested row field \"" + field.getName() + "\"");
        }
        unnestRow(nested, output);
      } else {
        output.addValue(input.getValue(i));
      }
    }
  }

  /** A {@link PTransform} that unnests nested row. */
  @AutoValue
  public abstract static class Inner<T> extends PTransform<PCollection<T>, PCollection<Row>> {
    /** Returns the naming policy used to derive flattened field names. */
    abstract SerializableFunction<List<String>, String> getFieldNameFunction();

    /** Returns a builder pre-populated with this transform's settings. */
    abstract Builder<T> toBuilder();

    /** AutoValue builder for {@link Inner}. */
    @AutoValue.Builder
    abstract static class Builder<T> {
      abstract Builder<T> setFieldNameFunction(SerializableFunction<List<String>, String> fn);

      abstract Inner<T> build();
    }

    /**
     * Sets a policy for naming deeply-nested fields.
     *
     * <p>This is needed to prevent name collisions when differently-nested fields have the same
     * name. The default is to use the {@link Unnest#CONCAT_FIELD_NAMES} strategy that concatenates
     * all names in the path to generate the unnested name. For example, an unnested name might be
     * field1_field2_field3. In some cases the {@link Unnest#KEEP_NESTED_NAME} strategy can be used
     * to keep only the most-deeply nested name. However if this results in conflicting names (e.g.
     * if a schema has two subrows that each have the same schema this will happen), the pipeline
     * will fail at construction time.
     *
     * <p>An example of using this function to customize the separator character:
     *
     * <pre>{@code
     * pc.apply(Unnest.<Type>create().withFieldNameFunction(l -> String.join("+", l)));
     * }</pre>
     *
     * @param fn maps the list of field names on the path to a leaf field to the flattened name
     * @return a copy of this transform that uses {@code fn} as its naming policy
     */
    public Inner<T> withFieldNameFunction(SerializableFunction<List<String>, String> fn) {
      return toBuilder().setFieldNameFunction(fn).build();
    }

    /**
     * Computes the unnested schema from the input collection's schema using the configured naming
     * policy, converts every element to a flat {@link Row}, and sets the unnested schema on the
     * resulting collection.
     */
    @Override
    public PCollection<Row> expand(PCollection<T> input) {
      Schema inputSchema = input.getSchema();
      Schema outputSchema = getUnnestedSchema(inputSchema, getFieldNameFunction());
      return input
          .apply(
              ParDo.of(
                  new DoFn<T, Row>() {
                    @ProcessElement
                    public void processElement(@Element Row row, OutputReceiver<Row> o) {
                      o.output(unnestRow(row, outputSchema));
                    }
                  }))
          .setRowSchema(outputSchema);
    }
  }
}